{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "toc": true
   },
   "source": [
    "<h1>Table of Contents<span class=\"tocSkip\"></span></h1>\n",
    "<div class=\"toc\"><ul class=\"toc-item\"><li><span><a href=\"#Select-variables-for-analysis\" data-toc-modified-id=\"Select-variables-for-analysis-1\"><span class=\"toc-item-num\">1&nbsp;&nbsp;</span>Select variables for analysis</a></span></li><li><span><a href=\"#Limit-data-to-subpopulation-of-states-in-19-C-West\" data-toc-modified-id=\"Limit-data-to-subpopulation-of-states-in-19-C-West-2\"><span class=\"toc-item-num\">2&nbsp;&nbsp;</span>Limit data to subpopulation of states in 19 C West</a></span><ul class=\"toc-item\"><li><span><a href=\"#Drop--variables-missing-all-values-&amp;-change-state-names\" data-toc-modified-id=\"Drop--variables-missing-all-values-&amp;-change-state-names-2.1\"><span class=\"toc-item-num\">2.1&nbsp;&nbsp;</span>Drop  variables missing all values &amp; change state names</a></span></li></ul></li><li><span><a href=\"#Export-data-to-Stata\" data-toc-modified-id=\"Export-data-to-Stata-3\"><span class=\"toc-item-num\">3&nbsp;&nbsp;</span>Export data to Stata</a></span></li></ul></div>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This notebook takes the big V-Dem dataset (too big for Stata IC) and reduces it to a subset of relevant variables, countries, and years before exporting it to Stata."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-05-01T19:59:21.753518Z",
     "start_time": "2021-05-01T19:59:21.381045Z"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-05-01T19:59:22.250832Z",
     "start_time": "2021-05-01T19:59:22.247416Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/Users/otto/Google Drive/02_Stanford/00_Researching/16_SocialScientization/-01_SSH/00_replication/\n",
      "/Users/otto/Google Drive/02_Stanford/00_Researching/16_SocialScientization/-01_SSH/00_replication/00_data/\n"
     ]
    }
   ],
   "source": [
    "# The project root is the parent of the current working directory, so the\n",
    "# notebook is expected to run from a folder directly inside that root.\n",
    "DIRECTORY = f'{os.path.dirname(os.getcwd())}/'\n",
    "# Data folder inside the project root (trailing slash kept for string joins below).\n",
    "DATA = f'{DIRECTORY}00_data/'\n",
    "print(DIRECTORY, DATA, sep='\\n')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Select variables for analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-05-01T19:59:23.554770Z",
     "start_time": "2021-05-01T19:59:23.552408Z"
    }
   },
   "outputs": [],
   "source": [
    "# Names of the V-Dem columns to load, written as one whitespace-separated block\n",
    "# so the selection is easy to edit; str.split() turns it into a list of names.\n",
    "variables = '''\n",
    "    year country_name histname e_regiongeo\n",
    "    v2exnamhog v2extithog v2lpname\n",
    "    e_migdpgro e_migdppc e_miurbani e_miurbpop\n",
    "    v2x_suffr e_polity2 v2x_gencs v2peprisch\n",
    "    e_civil_war e_miinteco e_miinterc e_pt_coup v2x_liberal\n",
    "    v3stnatant v3stcitlaw v3stnatbank v3stflag v3stcensus\n",
    "    v3ststatag v3ststybcov v3ststybpub v3struinvadm v3ststeecap\n",
    "'''.split()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-05-01T19:59:29.689049Z",
     "start_time": "2021-05-01T19:59:27.455464Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3145: DtypeWarning: Columns (363,829,830) have mixed types.Specify dtype option on import or set low_memory=False.\n",
      "  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,\n"
     ]
    }
   ],
   "source": [
    "# Load only the selected columns from the full V-Dem CSV.\n",
    "# low_memory=False makes pandas infer each column's dtype from the whole file\n",
    "# instead of chunk by chunk, which is what raised the mixed-type DtypeWarning.\n",
    "vdem = pd.read_csv(DATA + \"vdem10.csv\", usecols=variables, low_memory=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-05-01T19:59:37.981578Z",
     "start_time": "2021-05-01T19:59:37.972174Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(27013, 30)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vdem.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Limit data to subpopulation of states in 19 C West"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-05-01T19:59:41.733122Z",
     "start_time": "2021-05-01T19:59:41.720641Z"
    }
   },
   "outputs": [],
   "source": [
    "# Keep rows whose e_regiongeo code is below 5 or exactly 16\n",
    "# (presumably the European and North American regions; confirm in the V-Dem codebook).\n",
    "in_west = vdem['e_regiongeo'].lt(5) | vdem['e_regiongeo'].eq(16)\n",
    "west = vdem.loc[in_west]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-05-01T19:59:42.309112Z",
     "start_time": "2021-05-01T19:59:42.304622Z"
    }
   },
   "outputs": [],
   "source": [
    "# Restrict to years after 1799 and before 1915.\n",
    "# The explicit .copy() makes `west` an independent frame, so the cleaning steps\n",
    "# below do not modify a filtered selection (avoids SettingWithCopyWarning).\n",
    "west = west.loc[(west['year'] > 1799) & (west['year'] < 1915)].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-05-01T19:59:43.029238Z",
     "start_time": "2021-05-01T19:59:43.025923Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(3796, 30)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "west.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Drop  variables missing all values & change state names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-05-01T19:59:47.306901Z",
     "start_time": "2021-05-01T19:59:47.301112Z"
    }
   },
   "outputs": [],
   "source": [
    "# Drop columns that have no observations at all in this subsample.\n",
    "# Reassigning instead of inplace=True avoids mutating a frame that was produced\n",
    "# by filtering another one and keeps the step chainable.\n",
    "west = west.dropna(axis='columns', how='all')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-05-01T20:00:20.561355Z",
     "start_time": "2021-05-01T20:00:20.556146Z"
    }
   },
   "outputs": [],
   "source": [
    "# Remove any row in which every remaining column is missing.\n",
    "west = west.dropna(axis='index', how='all')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-05-01T20:00:21.059632Z",
     "start_time": "2021-05-01T20:00:21.056804Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(3796, 29)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "west.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-05-01T19:59:48.227627Z",
     "start_time": "2021-05-01T19:59:48.223594Z"
    }
   },
   "outputs": [],
   "source": [
    "# Exact-match replacement of the umlaut spelling wherever it appears as a cell value\n",
    "# (presumably to keep state names ASCII for Stata; confirm).\n",
    "# Reassigning instead of inplace=True avoids mutating a filtered frame in place.\n",
    "west = west.replace(\"Würtemberg\", \"Wuertemberg\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-05-01T20:00:27.235775Z",
     "start_time": "2021-05-01T20:00:27.215820Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>country_name</th>\n",
       "      <th>year</th>\n",
       "      <th>histname</th>\n",
       "      <th>v2x_suffr</th>\n",
       "      <th>v2x_liberal</th>\n",
       "      <th>v2lpname</th>\n",
       "      <th>v2exnamhog</th>\n",
       "      <th>v2extithog</th>\n",
       "      <th>v2peprisch</th>\n",
       "      <th>v3stcensus</th>\n",
       "      <th>...</th>\n",
       "      <th>v2x_gencs</th>\n",
       "      <th>e_polity2</th>\n",
       "      <th>e_regiongeo</th>\n",
       "      <th>e_migdpgro</th>\n",
       "      <th>e_migdppc</th>\n",
       "      <th>e_miurbani</th>\n",
       "      <th>e_miurbpop</th>\n",
       "      <th>e_civil_war</th>\n",
       "      <th>e_miinteco</th>\n",
       "      <th>e_miinterc</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>362</th>\n",
       "      <td>Sweden</td>\n",
       "      <td>1800</td>\n",
       "      <td>Kingdom of Sweden</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.699</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.09</td>\n",
       "      <td>-10.0</td>\n",
       "      <td>2</td>\n",
       "      <td>-0.036</td>\n",
       "      <td>1151.0</td>\n",
       "      <td>0.058</td>\n",
       "      <td>145.00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>363</th>\n",
       "      <td>Sweden</td>\n",
       "      <td>1801</td>\n",
       "      <td>Kingdom of Sweden</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.699</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.09</td>\n",
       "      <td>-10.0</td>\n",
       "      <td>2</td>\n",
       "      <td>-0.009</td>\n",
       "      <td>1141.0</td>\n",
       "      <td>0.059</td>\n",
       "      <td>146.46</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>364</th>\n",
       "      <td>Sweden</td>\n",
       "      <td>1802</td>\n",
       "      <td>Kingdom of Sweden</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.699</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.09</td>\n",
       "      <td>-10.0</td>\n",
       "      <td>2</td>\n",
       "      <td>0.046</td>\n",
       "      <td>1194.0</td>\n",
       "      <td>0.060</td>\n",
       "      <td>147.92</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>365</th>\n",
       "      <td>Sweden</td>\n",
       "      <td>1803</td>\n",
       "      <td>Kingdom of Sweden</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.699</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.09</td>\n",
       "      <td>-10.0</td>\n",
       "      <td>2</td>\n",
       "      <td>-0.010</td>\n",
       "      <td>1182.0</td>\n",
       "      <td>0.061</td>\n",
       "      <td>149.38</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>366</th>\n",
       "      <td>Sweden</td>\n",
       "      <td>1804</td>\n",
       "      <td>Kingdom of Sweden</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.699</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.09</td>\n",
       "      <td>-10.0</td>\n",
       "      <td>2</td>\n",
       "      <td>-0.065</td>\n",
       "      <td>1105.0</td>\n",
       "      <td>0.061</td>\n",
       "      <td>150.84</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 29 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    country_name  year           histname  v2x_suffr  v2x_liberal v2lpname  \\\n",
       "362       Sweden  1800  Kingdom of Sweden        0.0        0.699      NaN   \n",
       "363       Sweden  1801  Kingdom of Sweden        0.0        0.699      NaN   \n",
       "364       Sweden  1802  Kingdom of Sweden        0.0        0.699      NaN   \n",
       "365       Sweden  1803  Kingdom of Sweden        0.0        0.699      NaN   \n",
       "366       Sweden  1804  Kingdom of Sweden        0.0        0.699      NaN   \n",
       "\n",
       "    v2exnamhog v2extithog  v2peprisch  v3stcensus  ...  v2x_gencs  e_polity2  \\\n",
       "362        NaN        NaN         NaN         1.0  ...       0.09      -10.0   \n",
       "363        NaN        NaN         NaN         0.0  ...       0.09      -10.0   \n",
       "364        NaN        NaN         NaN         0.0  ...       0.09      -10.0   \n",
       "365        NaN        NaN         NaN         0.0  ...       0.09      -10.0   \n",
       "366        NaN        NaN         NaN         0.0  ...       0.09      -10.0   \n",
       "\n",
       "     e_regiongeo  e_migdpgro  e_migdppc  e_miurbani  e_miurbpop  e_civil_war  \\\n",
       "362            2      -0.036     1151.0       0.058      145.00          NaN   \n",
       "363            2      -0.009     1141.0       0.059      146.46          NaN   \n",
       "364            2       0.046     1194.0       0.060      147.92          NaN   \n",
       "365            2      -0.010     1182.0       0.061      149.38          NaN   \n",
       "366            2      -0.065     1105.0       0.061      150.84          NaN   \n",
       "\n",
       "     e_miinteco  e_miinterc  \n",
       "362         0.0         0.0  \n",
       "363         0.0         0.0  \n",
       "364         0.0         0.0  \n",
       "365         0.0         0.0  \n",
       "366         0.0         0.0  \n",
       "\n",
       "[5 rows x 29 columns]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "west.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Export data to Stata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-05-01T20:00:34.105388Z",
     "start_time": "2021-05-01T20:00:34.054310Z"
    }
   },
   "outputs": [],
   "source": [
    "# Write the cleaned subset into the data folder as a Stata .dta file.\n",
    "# NOTE(review): to_stata also writes the DataFrame index as a variable by default;\n",
    "# pass write_index=False if that column is not wanted in Stata.\n",
    "stata_path = DATA + \"west_vdem10.dta\"\n",
    "west.to_stata(stata_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": true,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
